# Helper libraries
import numpy as np
import pandas as pd
from time import time
from collections import Counter
import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import homogeneity_score
def warn(*args, **kwargs):
    """No-op replacement for warnings.warn; accepts any arguments, returns None.

    Kept (and still assigned below) for backward compatibility with the
    original script, which silenced warnings by monkeypatching warnings.warn.
    """
    pass


import warnings

# Use the supported API to silence warnings instead of relying solely on the
# fragile monkeypatch (libraries that import warnings.warn directly bypass it).
warnings.filterwarnings("ignore")
warnings.warn = warn
# Load the MNIST training data: one row per image, a 'label' column plus
# 784 pixel-intensity columns (28x28 images).
training_df = pd.read_csv("/Users/gurjy/Downloads/train.csv")
print(training_df.shape)
# Save the label in a separate variable, then drop it from the feature frame.
target = training_df['label']
training_df.drop("label", axis=1, inplace=True)
# The original line was a bare `target.shape` expression — a no-op in a
# script (it only displays in a notebook) — so print it explicitly.
print(target.shape)
X = training_df.values
# Standardize features to zero mean / unit variance before PCA.
X_std = StandardScaler().fit_transform(X)
# Calculate eigenvectors and eigenvalues of the covariance matrix of the
# standardized data. np.linalg.eigh is the correct routine for a symmetric
# matrix: it is faster than np.linalg.eig and guarantees real-valued output.
mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)
# (|eigenvalue|, eigenvector) pairs, sorted from largest to smallest eigenvalue
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda x: x[0], reverse=True)
# Explained variance from the eigenvalues, as percentages
tot = sum(eig_vals)
var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]  # individual
cum_var_exp = np.cumsum(var_exp)  # cumulative
# NOTE(review): this section originally re-ran the entire standardization and
# eigendecomposition a second time on identical inputs — an expensive exact
# duplicate of the block above. Alias the existing results instead; `tot`,
# `var_exp` and `cum_var_exp` already hold the same values from above.
eigvalues, eigvectors = eig_vals, eig_vecs
eigpairs = eig_pairs
# Plot individual and cumulative explained variance per principal component,
# to judge how many PCs are enough to describe the data in lower dimension.
cumulative_trace = go.Scatter(
    x=list(range(784)),
    y=cum_var_exp,
    mode='lines+markers',
    name="'Cumulative Explained Variance'",
    line=dict(shape='spline', color='goldenrod'),
)
individual_trace = go.Scatter(
    x=list(range(784)),
    y=var_exp,
    mode='lines+markers',
    name="'Individual Explained Variance'",
    line=dict(shape='linear', color='black'),
)
fig = tls.make_subplots(
    insets=[{'cell': (1, 1), 'l': 0.7, 'b': 0.5}],
    print_grid=True,
)
fig.append_trace(cumulative_trace, 1, 1)
fig.append_trace(individual_trace, 1, 1)
fig.layout.title = 'explained Variance plots'
fig.layout.xaxis = dict(range=[0, 800], title='Feature columns')
fig.layout.yaxis = dict(range=[0, 100], title='explained variance')
py.iplot(fig, filename='inset example')
# Project the standardized data onto the top 30 principal components.
pca = PCA(30)
# fit + transform combined into one call (equivalent to the separate
# pca.fit / pca.transform the original used)
X_pca = pca.fit_transform(X_std)
# The bare `.shape` expressions below were no-ops in a script; print them.
print(X_pca.shape)
print(X_std.shape)
# Principal directions: one 784-dimensional eigenvector per component.
eigenvectors = pca.components_
print(eigenvectors.shape)
# Visualize the first 28 principal directions (eigenvectors) as 28x28 images.
plt.figure(figsize=(17, 16))
n_rows = 4
n_cols = 7
for i in range(n_rows * n_cols):  # idiom fix: no list() wrapper around range()
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(eigenvectors[i].reshape(28, 28), cmap='twilight_shifted')
    plt.title('Eigenvector' + str(i + 1))
    # hide tick marks — these are images, not axes data
    plt.xticks(())
    plt.yticks(())
plt.show()
# Show the first 70 raw training digits, each titled with its true label.
plt.figure(figsize=(12, 13))
for i in range(70):  # idiom fix: no list() wrapper around range()
    plt.subplot(7, 10, i + 1)
    plt.title(target[i])
    plt.imshow(training_df.iloc[i].values.reshape(28, 28), interpolation="none", cmap='binary')
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()
# Bug fix: the original ended with a bare `plt.tight_layout` — an attribute
# access with missing call parentheses, i.e. a silent no-op — removed.
# 140 PCs explain ~80% of the data's variance; project onto them and
# scatter-plot the first two principal directions, colored by digit label.
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
pca_ = PCA(140)
X_140d = pca_.fit_transform(X_std_)
Target = target

pca_scatter = go.Scatter(
    x=X_140d[:, 0],
    y=X_140d[:, 1],
    name=str(Target),
    mode='markers',
    text=Target,
    showlegend=False,
    marker=dict(
        size=8,
        color=Target,
        colorscale='Jet',
        showscale=False,
        line=dict(width=2, color='rgb(255, 255, 255)'),
        opacity=0.8,
    ),
)
data = [pca_scatter]
layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=dict(title='First principal direction', ticklen=5, zeroline=False),
    yaxis=dict(title='Second principal direction', ticklen=5),
    showlegend=True,
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')
# Cluster the 140-PC projection with KMeans (10 clusters, one hoped-for
# cluster per digit) and plot the assignments in the first two PC directions.
kmeans = KMeans(10)
X_clustered140 = kmeans.fit_predict(X_140d)

cluster_marker = dict(
    size=8,
    color=X_clustered140,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_140d[:, 0],
    y=X_140d[:, 1],
    mode="markers",
    showlegend=False,
    marker=cluster_marker,
)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='second principal component', ticklen=5, gridwidth=2),
    showlegend=True,
)
data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")
# Cross-tabulate the 140-PC cluster assignments against the true labels and
# report three clustering-quality metrics. Bug fix: in the original script the
# crosstab and all three metric values were computed and then discarded
# (bare expressions only display in a notebook); print them explicitly.
x_clusters_df = pd.DataFrame(X_clustered140, dtype=int)
x_clusters_df.columns = ['Cluster']
targeted_df = pd.DataFrame(Target, dtype=int)
print(pd.crosstab(targeted_df.label, x_clusters_df.Cluster))
print("homogeneity:", homogeneity_score(Target, X_clustered140))
print("silhouette:", metrics.silhouette_score(X_140d, X_clustered140))
print("completeness:", metrics.completeness_score(Target, X_clustered140))
# Repeat the experiment with 319 PCs (and, further below, all 784 PCs):
# project and scatter-plot the first two principal directions by digit label.
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
pca_ = PCA(319)
X_319d = pca_.fit_transform(X_std_)
Target = target

pca_scatter = go.Scatter(
    x=X_319d[:, 0],
    y=X_319d[:, 1],
    name=str(Target),
    mode='markers',
    text=Target,
    showlegend=False,
    marker=dict(
        size=8,
        color=Target,
        colorscale='Jet',
        showscale=False,
        line=dict(width=2, color='rgb(255, 255, 255)'),
        opacity=0.8,
    ),
)
data = [pca_scatter]
layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=dict(title='First principal direction', ticklen=5, zeroline=False),
    yaxis=dict(title='Second principal direction', ticklen=5),
    showlegend=True,
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')
# KMeans (10 clusters) on the 319-PC projection, plotted in the first two PCs.
kmeans = KMeans(10)
X_clustered319 = kmeans.fit_predict(X_319d)

cluster_marker = dict(
    size=8,
    color=X_clustered319,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_319d[:, 0],
    y=X_319d[:, 1],
    mode="markers",
    showlegend=False,
    marker=cluster_marker,
)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='second principal component', ticklen=5, gridwidth=2),
    showlegend=True,
)
data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")
# Crosstab and clustering metrics for the 319-PC run. Bug fix: the original
# computed these as bare expressions whose results were silently discarded
# in a script; print them so the comparison across PC counts is visible.
x_clusters_df = pd.DataFrame(X_clustered319, dtype=int)
x_clusters_df.columns = ['Cluster']
targeted_df = pd.DataFrame(Target, dtype=int)
print(pd.crosstab(targeted_df.label, x_clusters_df.Cluster))
print("homogeneity:", homogeneity_score(Target, X_clustered319))
print("silhouette:", metrics.silhouette_score(X_319d, X_clustered319))
print("completeness:", metrics.completeness_score(Target, X_clustered319))
# Same experiment keeping all 784 PCs (i.e. a full rotation, no reduction).
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
pca_ = PCA(784)
X_784d = pca_.fit_transform(X_std_)
Target = target

pca_scatter = go.Scatter(
    x=X_784d[:, 0],
    y=X_784d[:, 1],
    name=str(Target),
    mode='markers',
    text=Target,
    showlegend=False,
    marker=dict(
        size=8,
        color=Target,
        colorscale='Jet',
        showscale=False,
        line=dict(width=2, color='rgb(255, 255, 255)'),
        opacity=0.8,
    ),
)
data = [pca_scatter]
layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=dict(title='First principal direction', ticklen=5, zeroline=False),
    yaxis=dict(title='Second principal direction', ticklen=5),
    showlegend=True,
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')
# KMeans (10 clusters) on the full 784-PC projection, plotted in the first two PCs.
kmeans = KMeans(10)
X_clustered784 = kmeans.fit_predict(X_784d)

cluster_marker = dict(
    size=8,
    color=X_clustered784,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_784d[:, 0],
    y=X_784d[:, 1],
    mode="markers",
    showlegend=False,
    marker=cluster_marker,
)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='second principal component', ticklen=5, gridwidth=2),
    showlegend=True,
)
data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")
# Crosstab and clustering metrics for the 784-PC run. Bug fix: the original
# computed these as bare expressions whose results were silently discarded
# in a script; print them so the comparison across PC counts is visible.
x_clusters_df = pd.DataFrame(X_clustered784, dtype=int)
x_clusters_df.columns = ['Cluster']
targeted_df = pd.DataFrame(Target, dtype=int)
print(pd.crosstab(targeted_df.label, x_clusters_df.Cluster))
print("homogeneity:", homogeneity_score(Target, X_clustered784))
print("silhouette:", metrics.silhouette_score(X_784d, X_clustered784))
print("completeness:", metrics.completeness_score(Target, X_clustered784))
# NOTE(review): this entire section is an exact duplicate of the KMeans-784
# plot rendered just above — same trace, same layout, same 'svm' filename —
# so it re-displays the identical figure. It is likely a copy/paste leftover;
# consider removing it.
tracekmeans = go.Scatter(x=X_784d[:, 0], y= X_784d[:, 1], mode="markers",
showlegend=False,
marker=dict(
size=8,
color = X_clustered784,
colorscale = 'Portland',
showscale=False,
line = dict(
width = 2,
color = 'rgb(255, 255, 255)'
)
))
layout=go.Layout(title='Kmeans clustering',
hovermode='closest',
xaxis=dict(title='first principal direction',
ticklen=5,
zeroline=False,
gridwidth=2),
yaxis=dict(title='second principal component',
ticklen=5,
gridwidth=2),
showlegend=True
)
data = [tracekmeans]
fig1 = dict(data=data, layout= layout)
py.iplot(fig1, filename="svm")
# Baseline: KMeans directly on the raw 784 pixel features (no PCA), to compare
# against the PCA-based runs above. Bug fix: the crosstab and the three metric
# values were bare expressions whose results were silently discarded in a
# script; print them explicitly.
kmeans = KMeans(10)
X_clustered = kmeans.fit_predict(training_df)
x_clusters_df = pd.DataFrame(X_clustered, dtype=int)
x_clusters_df.columns = ['Cluster']
targeted_df = pd.DataFrame(Target, dtype=int)
print(pd.crosstab(targeted_df.label, x_clusters_df.Cluster))
print("homogeneity:", homogeneity_score(Target, X_clustered))
# NOTE(review): silhouette is computed on X_ (the raw feature frame assigned
# earlier), matching the original; confirm that is the intended feature space.
print("silhouette:", metrics.silhouette_score(X_, X_clustered))
print("completeness:", metrics.completeness_score(Target, X_clustered))